library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lavaan)
## This is lavaan 0.6-16
## lavaan is FREE software! Please report any bugs.
library(semTools)
##
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
##
## Attaching package: 'semTools'
##
## The following object is masked from 'package:readr':
##
## clipboard
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the precomputed holdout item-pair similarities and the item metadata.
# NOTE(review): the first path has no .feather extension — confirm the file name.
holdout <- arrow::read_feather("ignore.data-holdout-set-item-similarity-20230710-164559")
llm_holdout_meta <- arrow::read_feather("ignore.llmdata-holdout-set-item-similarity-20230710-164559.feather")
# Attach instrument/scale metadata to both items of each pair.
# Join keys are spelled out explicitly rather than relying on dplyr's
# implicit common-column matching (which only emitted "Joining with" messages).
holdout_real <- holdout %>%
  select(ItemStemIdA, ItemStemIdB, Pearson) %>%
  left_join(
    llm_holdout_meta %>% select(ItemStemIdA = ItemStemId, VariableA = Variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1),
    by = join_by(ItemStemIdA)) %>%
  left_join(
    llm_holdout_meta %>% select(ItemStemIdB = ItemStemId, VariableB = Variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1),
    by = join_by(ItemStemIdB))
holdout_llm <- holdout %>%
  select(ItemStemIdA, ItemStemIdB, CosineSimilarity) %>%
  left_join(
    llm_holdout_meta %>% select(ItemStemIdA = ItemStemId, VariableA = Variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1),
    by = join_by(ItemStemIdA)) %>%
  left_join(
    llm_holdout_meta %>% select(ItemStemIdB = ItemStemId, VariableB = Variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1),
    by = join_by(ItemStemIdB))
# Convert a long data frame of item pairs into a symmetric item-by-item
# matrix via an undirected graph whose edge attribute holds the statistic.
# `value_col` is the (string) name of the column with the pairwise value.
# The diagonal is set to 1: an item's similarity with itself.
pairwise_to_matrix <- function(pairs, value_col) {
  m <- pairs %>%
    drop_na(InstrumentA, ScaleA, InstrumentB, ScaleB) %>%
    select(x = VariableA, y = VariableB, r = all_of(value_col)) %>%
    as.data.frame() |>
    igraph::graph_from_data_frame(directed = FALSE) |>
    igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
  diag(m) <- 1
  m
}
cors_llm <- pairwise_to_matrix(holdout_llm, "CosineSimilarity")
cors_real <- pairwise_to_matrix(holdout_real, "Pearson")
# Real scales: one row per instrument x scale_0, carrying the item list and a
# lavaan one-factor model string ("scale =~ item1 + item2 + ...").
scales <- llm_holdout_meta %>%
  drop_na(instrument, scale_0) %>%
  mutate(scale = str_replace_all(paste(instrument, scale_0), "[^a-zA-Z_0-9]", "_")) %>%
  group_by(scale) %>%
  summarise(
    items = list(Variable),
    lvn = paste(first(scale), " =~ ", paste(Variable, collapse = " + "))) %>%
  drop_na()
# Random pseudo-scales as a baseline: items drawn at random from the pool.
# NOTE(review): consider set.seed() here so the random scales are reproducible.
item_pool <- llm_holdout_meta %>% drop_na(instrument, scale_0)
random_scales <- vector("list", 25)
for (i in seq_len(25)) {
  # Keep at least 3 items so the one-factor CFA is identified (rpois(1, 10)
  # can return 0-2), and cap at the pool size so sample_n() cannot error.
  n_items <- min(max(3L, rpois(1, 10)), nrow(item_pool))
  random_scales[[i]] <- item_pool %>%
    sample_n(n_items) %>%
    mutate(scale = paste0("random", i)) %>%
    group_by(scale) %>%
    summarise(
      items = list(Variable),
      lvn = paste(first(scale), " =~ ", paste(Variable, collapse = " + "))) %>%
    drop_na()
}
random_scales <- bind_rows(random_scales)
scales <- bind_rows(scales, random_scales)
# Flag likely reverse-keyed items in a correlation matrix `rs`:
# items whose mean correlation with all OTHER items falls below a
# (slightly negative) threshold. Returns a character vector of item names.
find_reverse_items <- function(rs) {
  # Exclude the diagonal by position. The previous `x[-which(x == 1)]`
  # also dropped any off-diagonal correlation that happened to equal 1,
  # and `-which()` on an empty match would drop every element.
  mean_correlations <- (rowSums(rs) - diag(rs)) / (ncol(rs) - 1)
  # Threshold for "negative on average"; adjust to your criteria.
  threshold <- -0.01
  names(which(mean_correlations < threshold))
}
# Flag items negatively correlated with the FIRST item of the scale as
# reverse-keyed (the first item anchors the scale's direction).
# Returns a character vector of item names (never NULL).
find_reverse_items_by_first_item <- function(rs) {
  # Index by rowname explicitly: `rs[-1, 1]` drops names when only one
  # other item remains, so 2-item scales previously returned NULL and
  # were silently never reverse-keyed.
  others <- rownames(rs)[-1]
  others[rs[others, 1] < 0]
}
# Flip the sign of every correlation involving a reverse-keyed item.
# Two reverse-keyed items flip each other's correlation twice, so their
# mutual correlation keeps its original sign — exactly what per-item
# row/column negation produces.
reverse_items <- function(rs, reverse_keyed_items) {
  signs <- ifelse(rownames(rs) %in% reverse_keyed_items, -1, 1)
  flipped <- rs * outer(signs, signs)
  # Force the diagonal of reversed items back to 1 (self-correlation),
  # matching the explicit reset in the per-item formulation.
  rev_idx <- which(signs < 0)
  flipped[cbind(rev_idx, rev_idx)] <- 1
  flipped
}
# For every scale (real and random), row by row:
#   1. pull the item x item submatrices from the real and LLM matrices,
#   2. detect reverse-keyed items from the REAL correlations (negative
#      correlation with the first item) and flip them in BOTH matrices,
#   3. fit a one-factor CFA to each reversed matrix — nobs = 1000 is a
#      nominal sample size, since only correlation matrices are available,
#   4. compute composite reliability (omega) for both solutions.
# NOTE(review): reverse keying is derived from r_real only and then applied
# to r_llm as well — confirm that asymmetry is intended.
scales <- scales %>%
  rowwise() %>%
  mutate(r_real = list(cors_real[items, items]),
  r_llm = list(cors_llm[items, items])) %>%
  mutate(reverse_items = list(find_reverse_items_by_first_item(r_real)),
  r_real_rev = list(reverse_items(r_real, reverse_items)),
  r_llm_rev = list(reverse_items(r_llm, reverse_items))) %>%
  mutate(
  cfa_real = list(cfa(lvn, sample.cov = r_real_rev, sample.nobs = 1000)),
  rel_real = semTools::compRelSEM(cfa_real)) %>%
  mutate(
  cfa_llm = list(cfa(lvn, sample.cov = r_llm_rev, sample.nobs = 1000)),
  rel_llm = semTools::compRelSEM(cfa_llm))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `cfa_llm = list(cfa(lvn, sample.cov = r_llm_rev, sample.nobs =
## 1000))`.
## ℹ In row 27.
## Caused by warning in `lav_object_post_check()`:
## ! lavaan WARNING: some estimated ov variances are negative
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Drop scales whose reliability estimates fell outside [0, 1] (improper
# solutions, e.g. negative variances), then correlate the LLM-based
# reliabilities with the real-data reliabilities.
scales <- scales %>%
  filter(
    between(as.vector(rel_llm), 0, 1),
    between(as.vector(rel_real), 0, 1)
  )
cor.test(scales$rel_llm, scales$rel_real)
##
## Pearson's product-moment correlation
##
## data: scales$rel_llm and scales$rel_real
## t = 9.0572, df = 48, p-value = 5.844e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6623552 0.8784135
## sample estimates:
## cor
## 0.7942677
# Inspect the fitted real-data CFA for one example scale (LOT Optimism).
scales %>%
  filter(scale == "LOT_Optimism") %>%
  pull(cfa_real) %>%
  purrr::pluck(1) %>%
  summary()
## lavaan 0.6.16 ended normally after 33 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 12
##
## Number of observations 1000
##
## Model Test User Model:
##
## Test statistic 67.031
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Expected
## Information saturated (h1) model Structured
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## LOT_Optimism =~
## opt_1 1.000
## opt_2 1.370 0.152 8.995 0.000
## opt_3 1.269 0.146 8.716 0.000
## opt_4 1.656 0.173 9.581 0.000
## opt_5 1.758 0.181 9.717 0.000
## opt_6 1.461 0.159 9.213 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .opt_1 0.854 0.041 20.977 0.000
## .opt_2 0.728 0.038 19.275 0.000
## .opt_3 0.766 0.039 19.854 0.000
## .opt_4 0.603 0.036 16.844 0.000
## .opt_5 0.552 0.035 15.583 0.000
## .opt_6 0.691 0.037 18.649 0.000
## LOT_Optimism 0.145 0.027 5.334 0.000
# Scatter of LLM-derived vs real-data reliabilities; the dashed identity
# line marks perfect agreement. Rounding to 2 decimals only tidies the
# plotly hover labels. Uses left-assignment instead of `->` and drops the
# redundant `.` placeholder in the ggplot() call.
p <- scales %>%
  mutate(across(c(rel_real, rel_llm), ~ round(.x, 2))) %>%
  ggplot(aes(rel_real, rel_llm, label = scale)) +
  geom_abline(linetype = "dashed") +
  geom_point() +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1))
ggplotly(p)